1 Load data

se <- maxQuant("proteinGroups_thorax.xlsx", intensity = "LFQ", type = "xlsx", 
    sheet = "filtered")
## load annotation
cD <- openxlsx::read.xlsx("proteinGroups_thorax.xlsx", sheet = "annotation_filtered")
cD[, "Sample_IDs"] <- make.names(cD[, "Sample_IDs"])
colnames(cD)[colnames(cD) == "Sample_IDs"] <- "name"

## truncate se and write cD to colData(se)
se <- se[, cD$name]
se@colData <- cD %>% DataFrame()
colnames(se) <- se$name

## truncate colnames
colnames(se) <- stringr::str_replace(colnames(se), 
    "LFQ.intensity.1_KL_ProtMet_IO_40min_DDA_", "ProtMet_")
colnames(se) <- stringr::str_replace(colnames(se), 
    "LFQ.intensity.5_KL_nWorkFlow_IO_40min_DDA_", "nWorkFlow_")
colnames(se) <- stringr::str_replace(colnames(se), 
    "LFQ.intensity.5_KL_nWorkFlowt_IO_40min_DDA_", "nWorkFlow")

se$name <- colnames(se)

2 QC

shinyQC(se)

Exclude the following samples: ProtMet_8U04FR_NG_14_B10_1_6020, ProtMet_IJ17TV_NG_12_B8_1_6016.

se <- MatrixQCvis:::selectSampleSE(se, 
    selection = c("ProtMet_8U04FR_NG_14_B10_1_6020", "ProtMet_IJ17TV_NG_12_B8_1_6016"),
    mode = "exclude")

2.1 Translate the protein IDs

Translate Uniprot into Symbol.

library("org.Hs.eg.db")
uniprots <- Rkeys(org.Hs.egUNIPROT)
dict <- AnnotationDbi::select(org.Hs.eg.db, uniprots, "SYMBOL", "UNIPROT")
uniprot <- dict$UNIPROT
symbol <- dict$SYMBOL
names(symbol) <- uniprot

## rename
rownames(se) <- unlist(lapply(rownames(se), 
    function(x) paste(symbol[strsplit(x, split = ";")[[1]]], collapse = ";")))

## remove the features that have no corresponding Symbol
exclude_pattern <- c("NA", "NA;NA", "NA;NA;NA", "NA;NA;NA;NA", 
    "NA;NA;NA;NA;NA", "NA;NA;NA;NA;NA;NA", "NA;NA;NA;NA;NA;NA;NA")
se <- se[!rownames(se) %in% exclude_pattern, ]

3 Data Filtering, Transformation, and Imputation

Perform log-transformation on the data set.

## keep features with more than ca. 60% (18/35) measured values
se <- se[rowSums(!is.na(assay(se))) >= 18 , ]
dim(se)
## [1] 3010   35
## transformation and imputation
a <- assay(se)
a_t <- transformAssay(a, method = "log")
se <- MatrixQCvis:::updateSE(se = se, assay = a_t)

saveRDS(se, file = "SummarizedExperiment_extraction_method_cohort_proteomics.RDS")

4 Run the linear model

cD <- colData(se)
cD$Type_Processing <- paste(cD$Type, cD$Processing, sep = "_")
cD$Individual <- stringr::str_remove(cD$Pseudonym, pattern = "_NG|_TU")
design <- model.matrix(~ 0 + Type_Processing, data = cD)
colnames(design) <- make.names(colnames(design))
cor <- limma::duplicateCorrelation(assay(se), design, block=cD$Individual)
fit <- lmFit(object = assay(se), design = design, block = cD$Individual,
    correlation = cor$consensus)
## Warning: Partial NA coefficients for 57 probe(s)
## create contrasts
contrasts <- makeContrasts(
    autoSP3vsMTBE_SP3 = (Type_ProcessingTU_autoSP3 - Type_ProcessingNG_autoSP3)/2 -
        (Type_ProcessingTU_MTBE_SP3 - Type_ProcessingNG_MTBE_SP3)/2,
    TUvsNG = (Type_ProcessingTU_autoSP3 - Type_ProcessingTU_MTBE_SP3)/2 -
        (Type_ProcessingNG_autoSP3 - Type_ProcessingNG_MTBE_SP3)/2, ## identical to autoSP3vsMTBE_SP3 
    NG_autoSP3vsMTBE_SP3 = Type_ProcessingNG_autoSP3 - Type_ProcessingNG_MTBE_SP3,
    TU_autoSP3vsMTBE_SP3 = Type_ProcessingTU_autoSP3 - Type_ProcessingTU_MTBE_SP3,
    autoSP3 = Type_ProcessingTU_autoSP3 - Type_ProcessingNG_autoSP3,
    MTBE_SP3 = Type_ProcessingTU_MTBE_SP3 - Type_ProcessingNG_MTBE_SP3,
    levels = design)
fit_c <- contrasts.fit(fit, contrasts)
fit_eB <- eBayes(fit_c)

## set parameters for differential expression
num <- Inf
p_val <- 1
adj <- "BH"

4.1 autoSP3 vs. MTBE_SP3

We test here the DE proteins between NG and TU and compare the results that we get from the two extraction methods.

Ideally, this should result in few DE proteins. This would indicate that the methods yield similar results.

tT <- topTable(fit_eB, number = num, p.value = p_val, adjust.method = adj,
    coef = "autoSP3vsMTBE_SP3")
rmarkdown::paged_table(tT)
sum(tT[, "adj.P.Val"] < 0.05, na.rm = TRUE)
## [1] 2
tT <- cbind(name = rownames(tT), tT)
volcanoPlot(tT)
tT_autoSP3vsMTBE <- tT

write.table(tT, file = "proteomics_DE_t_autoSP3vsMTBE_SP3.txt", 
    quote = FALSE, sep = "\t")

4.2 autoSP3 vs. MTBE_SP3 (NG)

We test here the DE proteins from the two extraction methods looking only at NG.

Ideally, this should result in few DE proteins. This would indicate that the methods yield similar results.

tT <- topTable(fit_eB, number = num, p.value = p_val, adjust.method = adj,
    coef = "NG_autoSP3vsMTBE_SP3")
rmarkdown::paged_table(tT)
sum(tT[, "adj.P.Val"] < 0.05, na.rm = TRUE)
## [1] 809
tT <- cbind(name = rownames(tT), tT)
volcanoPlot(tT)
tT_NG <- tT

write.table(tT, file = "proteomics_DE_t_NG_autoSP3vsMTBE_SP3.txt", 
    quote = FALSE, sep = "\t")

4.3 autoSP3 vs. MTBE_SP3 (TU)

We test here the DE proteins from the two extraction methods looking only at TU

Ideally, this should result in few DE proteins. This would indicate that the methods yield similar results.

tT <- topTable(fit_eB, number = num, p.value = p_val, adjust.method = adj,
    coef = "TU_autoSP3vsMTBE_SP3")
rmarkdown::paged_table(tT)
sum(tT[, "adj.P.Val"] < 0.05, na.rm = TRUE)
## [1] 553
tT <- cbind(name = rownames(tT), tT)
volcanoPlot(tT)
tT_TU <- tT

write.table(tT, file = "proteomics_DE_t_TU_autoSP3vsMTBE_SP3.txt", 
    quote = FALSE, sep = "\t")

Ok, this is interesting, when looking at the individual tissues, there are quite many differences, but the differences “vanish” when we combine the different conditions (autoSP3 vs. MTBE_SP3).

4.4 Overlap between DE proteins

We will continue and check the overlap of DE proteins between the contrasts

- autoSP3 vs. MTBE_SP3 (NG)
- autoSP3 vs. MTBE_SP3 (TU)
- autoSP3 vs. MTBE_SP3 (TU-NG)
## only continue with the significant ones
NG_sign <- tT_NG[tT_NG$adj.P.Val < 0.05 & !is.na(tT_NG$adj.P.Val), ]
dim(NG_sign)
## [1] 809   8
TU_sign <- tT_TU[tT_TU$adj.P.Val < 0.05 & !is.na(tT_TU$adj.P.Val), ]
dim(TU_sign)
## [1] 553   8
autoSP3vsMTBE_sign <- tT_autoSP3vsMTBE[tT_autoSP3vsMTBE$adj.P.Val < 0.05 & !is.na(tT_autoSP3vsMTBE$adj.P.Val), ]
dim(autoSP3vsMTBE_sign)
## [1] 2 8
## create list
l <- list(NG = rownames(NG_sign), TU = rownames(TU_sign), 
    autoSP3vsMTBE_SP3 = rownames(autoSP3vsMTBE_sign))
UpSetR::upset(UpSetR::fromList(l), order.by = "freq")

pdf("UpSet_NAT_TT_autoSP3vsMTBESP3.pdf")
UpSetR::upset(UpSetR::fromList(l), order.by = "freq")
dev.off()
## png 
##   2

We will continue and check the overlap of DE proteins between the contrasts:

- TU vs. NG (autoSP3)
- TU vs. NG (MTBE_SP3)
## autoSP3
tT_autoSP3 <- topTable(fit_eB, number = num, p.value = p_val, adjust.method = adj,
    coef = "autoSP3")
sum(tT_autoSP3$adj.P.Val < 0.05, na.rm = TRUE)
## [1] 1386
tT_autoSP3 <- cbind(name = rownames(tT_autoSP3), tT_autoSP3)
volcanoPlot(tT_autoSP3)
write.table(tT_autoSP3, file = "proteomics_DE_t_TUvsNG_autoSP3.txt", 
    quote = FALSE, sep = "\t")

## MTBE_SP3
tT_MTBE_SP3 <- topTable(fit_eB, number = num, p.value = p_val, adjust.method = adj,
    coef = "MTBE_SP3")
rmarkdown::paged_table(tT_MTBE_SP3)
sum(tT_MTBE_SP3$adj.P.Val < 0.05, na.rm = TRUE)
## [1] 1382
tT_MTBE_SP3 <- cbind(name = rownames(tT_MTBE_SP3), tT_MTBE_SP3)
volcanoPlot(tT_MTBE_SP3)
write.table(tT_MTBE_SP3, file = "proteomics_DE_t_TUvsNG_MTBE_SP3.txt", 
    quote = FALSE, sep = "\t")

## only continue with the significant ones 
autoSP3_sign <- tT_autoSP3[tT_autoSP3$adj.P.Val < 0.05 & !is.na(tT_autoSP3$adj.P.Val), ]
MTBE_SP3_sign <- tT_MTBE_SP3[tT_MTBE_SP3$adj.P.Val < 0.05  & !is.na(tT_MTBE_SP3$adj.P.Val), ]

## create list
l <- list(autoSP3 = autoSP3_sign$ID, MTBE_SP3 = MTBE_SP3_sign$ID)
UpSetR::upset(UpSetR::fromList(l), order.by = "freq")

pdf("UpSet_NATvsTT_autoSP3_MTBESP3.pdf")
UpSetR::upset(UpSetR::fromList(l), order.by = "freq")
dev.off()
## png 
##   2
## Can we say anything about the magnitude of the difference? E.g. the
## overlapping 1004 proteins show on average a larger fold change than those
## detected by only one of the methods? Show in a box plot.
mtbe_auto_shared <- l$autoSP3[l$autoSP3 %in% l$MTBE_SP3]
mtbe_unique <- l$MTBE_SP3[!(l$MTBE_SP3 %in% l$autoSP3)]
auto_unique <- l$autoSP3[!(l$autoSP3 %in% l$MTBE_SP3)]

df_mtbe_shared <- data.frame(
    logFC = tT_MTBE_SP3[tT_MTBE_SP3$ID %in% mtbe_auto_shared, "logFC"],
    set = "shared", type = "MTBE-SP3")
df_auto_shared <- data.frame(
    logFC = tT_autoSP3[tT_autoSP3$ID %in% mtbe_auto_shared, "logFC"],
    set = "shared", type = "autoSP3")
df_mtbe_unique <- data.frame(
    logFC = tT_MTBE_SP3[tT_MTBE_SP3$ID %in% mtbe_unique, "logFC"],
    set = "unique", type = "MTBE-SP3")
df_auto_unique <- data.frame(
    logFC = tT_autoSP3[tT_autoSP3$ID %in% auto_unique, "logFC"],
    set = "unique", type = "autoSP3")
df_all <- rbind(df_mtbe_shared, df_auto_shared, df_mtbe_unique, df_auto_unique)

g <- ggplot(df_all) +
    geom_beeswarm(aes(y = logFC, x = type), size = 0.5, alpha = 0.5) +
    facet_wrap(~ set + type, ncol = 4, scales = "free_x") +
    theme_classic()
ggsave(filename = "BeeSwarm_NATvsTT_autoSP3_MTBESP3.pdf", plot = g)
## Saving 7 x 5 in image
df_tmp <- df_all
df_tmp$logFC <- abs(df_tmp$logFC)
wilcox.test(
    df_tmp[df_tmp$set == "shared" & df_tmp$type == "autoSP3", "logFC"],
    df_tmp[df_tmp$set == "unique" & df_tmp$type == "autoSP3", "logFC"],
    alternative = "greater")
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  df_tmp[df_tmp$set == "shared" & df_tmp$type == "autoSP3", "logFC"] and df_tmp[df_tmp$set == "unique" & df_tmp$type == "autoSP3", "logFC"]
## W = 239420, p-value = 4.105e-13
## alternative hypothesis: true location shift is greater than 0
wilcox.test(
    df_tmp[df_tmp$set == "shared" & df_tmp$type == "MTBE-SP3", "logFC"],
    df_tmp[df_tmp$set == "unique" & df_tmp$type == "MTBE-SP3", "logFC"],
    alternative = "greater")
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  df_tmp[df_tmp$set == "shared" & df_tmp$type == "MTBE-SP3", "logFC"] and df_tmp[df_tmp$set == "unique" & df_tmp$type == "MTBE-SP3", "logFC"]
## W = 230510, p-value = 3.588e-10
## alternative hypothesis: true location shift is greater than 0
## TU vs NG. (taking into account baseline autoSP3/MTBE-SP3)
tT <- topTable(fit_eB, number = num, p.value = p_val, adjust.method = adj,
    coef = "TUvsNG")
rmarkdown::paged_table(tT)
tT <- cbind(name = rownames(tT), tT)
volcanoPlot(tT)
tT_TUvsNG <- tT

## plot t-values
df <- data.frame(vals_x = tT_autoSP3[rownames(tT_autoSP3), "t"], 
        vals_y = tT_MTBE_SP3[rownames(tT_autoSP3), "t"],
        vals_z = tT_TUvsNG[rownames(tT_autoSP3), "t"])
rownames(df) <- rownames(tT_autoSP3)

## t-values(autoSP3) vs t-values (MTBE-SP3)
g <- ggplot(df, aes_string(x = "vals_x", y = "vals_y")) +
    geom_point(aes(alpha = 0.7)) +
    xlab("t-values (autoSP3)") + ylab("t-values (MTBE-SP3)") +
    ggpubr::stat_cor(method = "spearman", cor.coef.name = "rho") +
    ggtitle("NAT vs. TT") +
    theme_classic() +
    theme(legend.position = "none")
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
g 
## Warning: Removed 57 rows containing non-finite values (`stat_cor()`).
## Warning: Removed 57 rows containing missing values (`geom_point()`).

ggsave(g, filename = "scatter_NATvsTT_autoSP3_MTBESP3.pdf")
## Saving 7 x 5 in image
## Warning: Removed 57 rows containing non-finite values (`stat_cor()`).
## Removed 57 rows containing missing values (`geom_point()`).
cor.test(tT_autoSP3[rownames(tT_autoSP3), "t"],
    tT_MTBE_SP3[rownames(tT_autoSP3), "t"], method = "spearman")
## 
##  Spearman's rank correlation rho
## 
## data:  tT_autoSP3[rownames(tT_autoSP3), "t"] and tT_MTBE_SP3[rownames(tT_autoSP3), "t"]
## S = 711496290, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.8342194
## t-values(autoSP3) vs t-values (AutoSP3vsMTBE-SP3)
g <- ggplot(df, aes_string(x = "vals_x", y = "vals_z")) +
    geom_point(aes(alpha = 0.7)) +
    xlab("t-values (autoSP3)") + ylab("t-values (autoSP3/MTBE-SP3)") +
    ggpubr::stat_cor(method = "spearman", cor.coef.name = "rho") +
    ggtitle("NAT vs. TT") +
    theme_classic() +
    theme(legend.position = "none")
g 
## Warning: Removed 57 rows containing non-finite values (`stat_cor()`).
## Removed 57 rows containing missing values (`geom_point()`).

ggsave(g, filename = "scatter_NATvsTT_autoSP3_autoSP3vsMTBESP3.pdf")
## Saving 7 x 5 in image
## Warning: Removed 57 rows containing non-finite values (`stat_cor()`).
## Removed 57 rows containing missing values (`geom_point()`).
## t-values(MTBE-SP3) vs t-values (AutoSP3vsMTBE-SP3)
g <- ggplot(df, aes_string(x = "vals_y", y = "vals_z")) +
    geom_point(aes(alpha = 0.7)) +
    xlab("t-values (MTBE-SP3)") + ylab("t-values (autoSP3/MTBE-SP3)") +
    ggpubr::stat_cor(method = "spearman", cor.coef.name = "rho") +
    ggtitle("NAT vs. TT") +
    theme_classic() +
    theme(legend.position = "none")
g 
## Warning: Removed 57 rows containing non-finite values (`stat_cor()`).
## Removed 57 rows containing missing values (`geom_point()`).

ggsave(g, filename = "scatter_NATvsTT_MTBESP3_autoSP3vsMTBESP3.pdf")
## Saving 7 x 5 in image
## Warning: Removed 57 rows containing non-finite values (`stat_cor()`).
## Removed 57 rows containing missing values (`geom_point()`).

4.4.1 GO enrichment analysis TU vs NG (autoSP3)

Run now the GO enrichment tests. Use the raw (unadjusted) values and return the GO terms for the ontologies Biological Process (BP), Molecular Function (MF) and Cellular Component (CC). Perform a Fisher test with the significant features (\(\alpha\) < 0.05).

For autoSP3.

## [1] "## BP"
## [1] "## MF"
## [1] "## CC"
## [1] "## BP mitochondria:"
## [1] 7
## [1] "## MF mitochondria:"
## [1] 0
## [1] "## CC mitochondria:"
## [1] 1

4.4.2 GO enrichment analysis TU vs NG (MTBE-SP3)

Run now the GO enrichment tests. Use the raw (unadjusted) values and return the GO terms for the ontologies Biological Process (BP), Molecular Function (MF) and Cellular Component (CC). Perform a Fisher test with the significant features (\(\alpha\) < 0.05).

For MTBE-SP3.

## [1] "## BP"
## [1] "## MF"
## [1] "## CC"
## [1] "## BP mitochondria:"
## [1] 0
## [1] "## MF mitochondria:"
## [1] 0
## [1] "## CC mitochondria:"
## [1] 0

4.5 Individuals as covariates

cD <- colData(se)
cD$Type_Processing <- paste(cD$Type, cD$Processing, sep = "_")
cD$Individual <- stringr::str_remove(cD$Pseudonym, pattern = "_NG|_TU")
design <- model.matrix(~ 0 + Type_Processing + Individual, data = cD)
fit <- lmFit(object = assay(se), design = design)
## Warning: Partial NA coefficients for 592 probe(s)
## create contrasts
contrasts <- makeContrasts(
    autoSP3 = Type_ProcessingTU_autoSP3 - Type_ProcessingNG_autoSP3,
    MTBE_SP3 = Type_ProcessingTU_MTBE_SP3 - Type_ProcessingNG_MTBE_SP3,
    levels = design)
fit_c <- contrasts.fit(fit, contrasts)
fit_eB <- eBayes(fit_c)

## set parameters for differential expression
num <- Inf
p_val <- 1
adj <- "BH"

## get the features for autoSP3
tT_autoSP3_covariate <- topTable(fit_eB, number = num, p.value = p_val, 
    adjust.method = adj, coef = "autoSP3")

cor(x = tT_autoSP3_covariate$t[order(rownames(tT_autoSP3_covariate))],
    y = tT_autoSP3$t[order(rownames(tT_autoSP3))], 
    use = "pairwise.complete.obs")
## [1] 0.9756756
## get the features for MTBE_SP3
tT_MTBE_SP3_covariate <- topTable(fit_eB, number = num, p.value = p_val, 
    adjust.method = adj, coef = "MTBE_SP3")

cor(x = tT_MTBE_SP3_covariate$t[order(rownames(tT_MTBE_SP3_covariate))],
    y = tT_MTBE_SP3$t[order(rownames(tT_MTBE_SP3))], 
    use = "pairwise.complete.obs")
## [1] 0.9725197

4.6 Tissue-heterogeneity

We test here the effect of tissue heterogeneity within the autoSP3 and MTBE-SP3 samples. Therefore, we randomly select 50% of the samples from within autoSP3 and MTBE-SP3 samples and run linear models within each extraction method.

In case of tissue heterogeneity, there should be some proteins that are differentially abundant.

se$Patient_ID <- stringr::str_remove(se$Pseudonym, pattern = "_TU|_NG")

##
## MTBE-SP3
se_mtbesp3 <- se[, se$Processing == "MTBE_SP3"]
ids_mtbesp3 <- se_mtbesp3$Patient_ID |> unique()
set.seed(2023)
ids_group1 <- sample(ids_mtbesp3, size = floor(length(ids_mtbesp3)/ 2),
    replace = FALSE)
se_mtbesp3$random_group <- ifelse(se_mtbesp3$Patient_ID %in% ids_group1, "1", "2")

## run the linear model, adjust for the tissue effect
design <- model.matrix(~ 0 + random_group + Type, data = colData(se_mtbesp3))
colnames(design) <- make.names(colnames(design))
fit <- lmFit(object = assay(se_mtbesp3), design = design)
## Warning: Partial NA coefficients for 17 probe(s)
## make contrasts
contrasts <- makeContrasts(
    group1_vs_group2 = random_group1 - random_group2,
    levels = design)
fit_c <- contrasts.fit(fit, contrasts)
fit_eB <- eBayes(fit_c)

## get the features for MTBE-SP3
tT_MTBESP3 <- topTable(fit_eB, number = num, p.value = p_val, 
    adjust.method = adj)
sum(tT_MTBESP3[, "adj.P.Val"] < 0.05)
## [1] NA
rmarkdown::paged_table(tT_MTBESP3)
##
## autoSP3
se_autosp3 <- se[, se$Processing == "autoSP3"]
ids_autosp3 <- se_autosp3$Patient_ID |> unique()
set.seed(2023)
ids_group1 <- sample(ids_autosp3, size = floor(length(ids_autosp3)/ 2), 
    replace = FALSE)
se_autosp3$random_group <- ifelse(se_autosp3$Patient_ID %in% ids_group1, "1", "2")

## run the linear model, adjust for the tissue effect
design <- model.matrix(~ 0 + random_group + Type , data = colData(se_autosp3))
colnames(design) <- make.names(colnames(design))
fit <- lmFit(object = assay(se_autosp3), design = design)
## Warning: Partial NA coefficients for 46 probe(s)
## make contrasts
contrasts <- makeContrasts(
    group1_vs_group2 = random_group1 - random_group2,
    levels = design)
fit_c <- contrasts.fit(fit, contrasts)
fit_eB <- eBayes(fit_c)

## get the features for MTBE-SP3
tT_autoSP3 <- topTable(fit_eB, number = num, p.value = p_val, 
    adjust.method = adj)
sum(tT_autoSP3[, "adj.P.Val"] < 0.05)
## [1] 0
rmarkdown::paged_table(tT_autoSP3)

  1. European Molecular Biology Laboratory, Meyerhofstrasse 1, 69117 Heidelberg, Germany↩︎